Progress Check

So I've managed to train a model for a single epoch. Now I can check what (if anything) it has managed to pick up so far.


In [28]:
import os, re
import random
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.callbacks import TensorBoard

In [2]:
ROOT_PATH = r'C:\Users\caleb\Documents\Data Science\welcome-to-night-vale'
DATA_PATH = os.path.join(ROOT_PATH, 'data')
MODEL_PATH = os.path.join(DATA_PATH, 'models')
LOG_PATH = os.path.join(ROOT_PATH, 'logs')

In [3]:
model = load_model(os.path.join(MODEL_PATH, 'wtnv-keras-model.hd5'))

Next step is to generate text from random values. To do this, I need to have the same alphabet and so forth.


In [4]:
def load_text(filepath):
    '''Load text file from DATA_PATH'''
    with open(os.path.join(DATA_PATH, filepath),
              'r', encoding='utf-8') as f:
        text = f.read()
        return text

In [10]:
def get_alphabet(text):
    # lowercase text
    text = text.lower()

    # create mapping of unique chars to integers, and a reverse mapping
    chars = sorted(list(set(text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))

    # summarize the loaded data
    n_chars = len(text)
    n_vocab = len(chars)
    
    return char_to_int, int_to_char

In [11]:
def pre_processing(text, seq_length=100):
	'''Preprocesses text file for model.
	   Lowercases text, converts to integer arrays of length seq_length.

	   Args:
	  	text - text file to be processed
	  	seq_length - length of character sequences to be considered 
	   				 in the training set
		
	   Returns:
		X - Array of integers representing character sequences from
			the training text with length seq_length.
			X.shape = (n_chars - seq_length, seq_length, 1)
		y - Array of integers representing next characters for each
			sequence in X.
			y.shape = (n_chars - seq_length, n_vocab)'''

	# lowercase text
	text = text.lower()

	# create mapping of unique chars to integers, and a reverse mapping
	chars = sorted(list(set(text)))
	char_to_int = dict((c, i) for i, c in enumerate(chars))
	int_to_char = dict((i, c) for i, c in enumerate(chars))

	# summarize the loaded data
	n_chars = len(text)
	n_vocab = len(chars)
	print("Total Characters:", n_chars)
	print("Total Vocab:", n_vocab)

	# prepare the dataset of input to output pairs encoded as integers
	dataX = []
	dataY = []
	for i in range(0, n_chars - seq_length, 1):
		seq_in = text[i:i + seq_length]
		seq_out = text[i + seq_length]
		dataX.append([char_to_int[char] for char in seq_in])
		dataY.append(char_to_int[seq_out])

	n_patterns = len(dataX)
	print("Total Patterns:", n_patterns)

	# reshape X to be [samples, time steps, features]
	X = np.reshape(dataX, (n_patterns, seq_length, 1))

	# normalize
	X = X / n_vocab

	# one hot encode the output variable
	y = np_utils.to_categorical(dataY)

	return X, y

In [19]:
text = load_text('Welcome To Night Vale.txt')
char_to_int, int_to_char = get_alphabet(text)
n_vocab = len(char_to_int)
X, y = pre_processing(text)


Total Characters: 1628468
Total Vocab: 192
Total Patterns: 1628368

In [40]:
index = random.randint(0, X.shape[0])
print(''.join([int_to_char[i] for i in (X[index,:,0] * n_vocab).astype(np.int32)]))


hat any work in grove park was making way for a new swing set, picnic area, and bloodstone circle, w

In [50]:
print(''.join([int_to_char[i] for i in random.sample(range(n_vocab), 100)]))


з̱ ºzдͯр̳éd9p̩̏aî̜̝(i©̹<̠̥ 5ü8k♫ͨу̮f̃‽̙͚͓͊̐ц:͎͛а̤̾]̫ͬо̻͉͈͂6uэͥ.ьxж/̭̎͗̒♪̄n̞¼ш̲ͮ͒̔ͅâ“̺̣̉̍в-̖̅̀ю‘̦̪ͣ”$

In [131]:
# generate sequence of random characters
seq = np.random.choice(range(n_vocab), size=(1, 100, 1))

for i in range(1000):    
    # predict next character in sequence
    next_char = np.random.choice(range(n_vocab), 
                                 p=model.predict(seq)[0,:])

    # append next character, drop first character and reshape
    seq = np.append(seq, next_char)[1:].reshape(1,100,1)

print(''.join([int_to_char[i] for i in seq[0,:,0]]))


i n it  r  sss  d  g 
j  s  ,  я e  y  s n e i e e  ys 
j os  r        ssst     g   e c   m   


st 

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: